The data is from Kaggle and includes
# Original URL: https://www.kaggle.com/datasets/yasserh/walmart-dataset/data
store_data = read.csv("/Users/delracman/Documents/STAT 7500/Project/Walmart.csv") %>%
mutate(
Date=as.Date(Date,format="%d-%m-%Y"),
Year=year(Date),
Month=month(Date),
Day=day(Date),
Week=week(Date)
) %>%
arrange(Date)
head(store_data)
## Store Date Weekly_Sales Holiday_Flag Temperature Fuel_Price CPI
## 1 1 2010-02-05 1643690.9 0 42.31 2.572 211.0964
## 2 2 2010-02-05 2136989.5 0 40.19 2.572 210.7526
## 3 3 2010-02-05 461622.2 0 45.71 2.572 214.4249
## 4 4 2010-02-05 2135143.9 0 43.76 2.598 126.4421
## 5 5 2010-02-05 317173.1 0 39.70 2.572 211.6540
## 6 6 2010-02-05 1652635.1 0 40.43 2.572 212.6224
## Unemployment Year Month Day Week
## 1 8.106 2010 2 5 6
## 2 8.324 2010 2 5 6
## 3 7.368 2010 2 5 6
## 4 8.623 2010 2 5 6
## 5 6.566 2010 2 5 6
## 6 7.259 2010 2 5 6
There are peaks at similar times each year.
store_data %>%
group_by(Date) %>%
summarize(Total_Sales=sum(Weekly_Sales)) %>%
ggplot() +
geom_line(aes(x=Date,y=Total_Sales)) +
labs(title="Total Sales Each Week") +
scale_x_date(breaks = function(x) seq.Date(from = as.Date("2010-01-01"),
to = as.Date("2012-12-31"),
by = "4 months"))
# Think about way to replace axis labels with months instead of weeks
# (since beginning of the week corresponding with each number changes each year)
avgsales = store_data %>%
group_by(Year,Week) %>%
summarize(
avgsales=mean(Weekly_Sales),
scaled=avgsales/100000
)
avgsales %>%
ggplot() +
geom_line(aes(x=Week,y=scaled,color=factor(Year))) +
labs(
x="Week Number",
y="Average Weekly Sales (in hundreds of thousands)",
title="Average Weekly Sales Per Year Across All Stores"
) +
scale_x_continuous(breaks = seq(1,53,by=4))
store_data %>%
group_by(Year,Week) %>%
mutate(
avgsales=mean(Weekly_Sales),
scaled=avgsales/100000,
plot_date=as.Date(paste(2013,Month,Day,sep="-"),format="%Y-%m-%d")
) %>%
ggplot() +
geom_line(aes(x=plot_date,y=scaled,color=factor(Year))) +
labs(
x="Month-Day",
y="Average Weekly Sales (in hundreds of thousands)",
title="Average Weekly Sales Per Year Across All Stores"
) +
scale_x_date(date_labels="%m-%d")
# Add other holidays to the store_data. Flag with "2" to differentiate from pre-existing holidays in the store_dataset
# Observation: dates are for end of week, not beginning of week (so take date and check week previous, not week after)
# Also vectorized to be able to use with ifelse
holidays = c(
"2010-05-31", "2011-05-30", "2012-05-28",
"2010-04-04", "2011-04-24", "2012-04-08",
"2010-10-31", "2011-10-31", "2012-10-31",
"2010-07-04", "2011-07-04", "2012-07-04",
"2010-02-14", "2011-02-14", "2012-02-14"
)
determine_holiday = function(day) {
for (holiday in holidays) {
if(between(as.numeric(as.Date(holiday,format="%Y-%m-%d")-day), -6, 0)) {
return(TRUE)
}
}
return(FALSE)
}
# Allows determine_holiday to take in one date at a time in the ifelse
determine_holiday_vectorized <- Vectorize(determine_holiday)
store_data_with_holidays = store_data %>%
mutate(
Holiday_Flag = ifelse(Holiday_Flag==0, ifelse(determine_holiday_vectorized(Date),2,Holiday_Flag),Holiday_Flag)
)
head(store_data_with_holidays %>% filter(Holiday_Flag==2))
## Store Date Weekly_Sales Holiday_Flag Temperature Fuel_Price CPI
## 1 1 2010-02-19 1611968.2 2 39.93 2.514 211.2891
## 2 2 2010-02-19 2124451.5 2 39.69 2.514 210.9452
## 3 3 2010-02-19 421642.2 2 47.07 2.514 214.6199
## 4 4 2010-02-19 2049860.3 2 36.45 2.540 126.5263
## 5 5 2010-02-19 303447.6 2 41.14 2.514 211.8471
## 6 6 2010-02-19 1567138.1 2 43.58 2.514 212.8162
## Unemployment Year Month Day Week
## 1 8.106 2010 2 19 8
## 2 8.324 2010 2 19 8
## 3 7.368 2010 2 19 8
## 4 8.623 2010 2 19 8
## 5 6.566 2010 2 19 8
## 6 7.259 2010 2 19 8
# 6, 14, 23, 27, 40, 45, 47, 48, 49, 51, 52
store_data_with_holidays %>%
filter(Week %in% c(6,14,23,27,40,45,47,48,49,51,52) | Holiday_Flag %in% c(1,2)) %>%
select(Date,Week,Holiday_Flag) %>%
unique() %>%
select(Date,Week,Holiday_Flag)
## Date Week Holiday_Flag
## 1 2010-02-05 6 0
## 46 2010-02-12 7 1
## 91 2010-02-19 8 2
## 136 2010-04-02 14 0
## 181 2010-04-09 15 2
## 226 2010-06-04 23 2
## 271 2010-07-02 27 0
## 316 2010-07-09 28 2
## 361 2010-09-10 37 1
## 406 2010-10-01 40 0
## 451 2010-11-05 45 2
## 496 2010-11-19 47 0
## 541 2010-11-26 48 1
## 586 2010-12-03 49 0
## 631 2010-12-17 51 0
## 676 2010-12-24 52 0
## 721 2010-12-31 53 1
## 766 2011-02-11 6 1
## 811 2011-02-18 7 2
## 856 2011-04-08 14 0
## 901 2011-04-29 17 2
## 946 2011-06-03 22 2
## 991 2011-06-10 23 0
## 1036 2011-07-08 27 2
## 1081 2011-09-09 36 1
## 1126 2011-10-07 40 0
## 1171 2011-11-04 44 2
## 1216 2011-11-11 45 0
## 1261 2011-11-25 47 1
## 1306 2011-12-02 48 0
## 1351 2011-12-09 49 0
## 1396 2011-12-23 51 0
## 1441 2011-12-30 52 1
## 1486 2012-02-10 6 1
## 1531 2012-02-17 7 2
## 1576 2012-04-06 14 0
## 1621 2012-04-13 15 2
## 1666 2012-06-01 22 2
## 1711 2012-06-08 23 0
## 1756 2012-07-06 27 2
## 1801 2012-09-07 36 1
## 1846 2012-10-05 40 0
# https://search.r-project.org/CRAN/refmans/IDPmisc/html/peaks.html
store_data %>%
ggplot()+geom_point(aes(x=Temperature,y=Weekly_Sales,color=factor(Store)))
store_data %>%
ggplot()+geom_point(aes(x=Fuel_Price,y=Weekly_Sales,color=factor(Store)))
store_data %>%
ggplot()+geom_point(aes(x=CPI,y=Weekly_Sales,color=factor(Store)))
roundedstore_data = store_data %>%
mutate(
RoundedTemp=round(Temperature,digits=-1),
RoundedCPI=round(CPI,digits=-1),
RoundedFuel=round(Fuel_Price,digits=1),
RoundedUnemployment=round(Unemployment,digits=1)
)
roundedstore_data %>%
group_by(Store) %>%
ggplot(aes(x=Store,y=Weekly_Sales)) + geom_point(aes(color=factor(RoundedTemp)))
roundedstore_data %>%
group_by(Store) %>%
ggplot(aes(x=RoundedTemp,y=Weekly_Sales)) + geom_point(aes(color=factor(Store)))
roundedstore_data %>%
group_by(RoundedTemp) %>%
summarize(AvgSales=mean(Weekly_Sales)) %>%
ggplot(aes(x=RoundedTemp,y=AvgSales)) + geom_bar(stat = "identity")
roundedstore_data %>%
group_by(RoundedCPI) %>%
summarize(AvgSales=mean(Weekly_Sales)) %>%
ggplot(aes(x=RoundedCPI,y=AvgSales)) + geom_bar(stat = "identity")
roundedstore_data %>%
group_by(RoundedFuel) %>%
summarize(AvgSales=mean(Weekly_Sales)) %>%
ggplot(aes(x=RoundedFuel,y=AvgSales)) + geom_bar(stat = "identity")
roundedstore_data %>%
group_by(RoundedUnemployment) %>%
summarize(AvgSales=mean(Weekly_Sales)) %>%
ggplot(aes(x=RoundedUnemployment,y=AvgSales)) +
geom_bar(stat = "identity") +
geom_smooth(method="lm", aes(color="red")) +
theme(legend.position = "none")
## `geom_smooth()` using formula = 'y ~ x'
store_data %>%
group_by(Store) %>%
summarize(CPI=mean(CPI),Unemployment=mean(Unemployment),Fuel_Price=mean(Fuel_Price),Weekly_Sales=mean(Weekly_Sales)) %>%
arrange(desc(Weekly_Sales))
## # A tibble: 45 × 5
## Store CPI Unemployment Fuel_Price Weekly_Sales
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 20 209. 7.37 3.42 2107677.
## 2 4 129. 5.96 3.22 2094713.
## 3 14 186. 8.65 3.42 2020978.
## 4 13 129. 7.00 3.29 2003620.
## 5 2 216. 7.62 3.22 1925751.
## 6 10 129. 8.36 3.58 1899425.
## 7 27 139. 8.00 3.58 1775216.
## 8 6 218. 6.61 3.22 1564728.
## 9 1 216. 7.61 3.22 1555264.
## 10 39 215. 7.87 3.22 1450668.
## # ℹ 35 more rows
store_data_with_holidays %>%
group_by(Date) %>%
ggplot(aes(x=Date,y=Weekly_Sales))+geom_point(aes(color=factor(Holiday_Flag)))
store_data_with_holidays %>%
group_by(Store) %>%
ggplot(aes(x=factor(Store),y=Weekly_Sales))+geom_point(aes(color=factor(Holiday_Flag)))
rows = store_data %>%
group_by(Store,Year) %>%
summarize(n=n())
## `summarise()` has grouped output by 'Store'. You can override using the
## `.groups` argument.
high_low = store_data %>%
left_join(rows,by=c('Store','Year')) %>%
group_by(Store,Year) %>%
mutate(rank=min_rank(-Weekly_Sales)) %>%
filter(rank==1 | rank==n) %>%
arrange(Store, n) %>%
mutate(high=ifelse(rank==1,1,0))
high_low %>%
ggplot(aes(x=Date,y=Weekly_Sales)) + geom_point(aes(color=factor(high), shape=factor(Holiday_Flag))) +
labs(title="Date vs Weekly Sales to Show Most/Least Sales, Accounting for Holidays")
yearly_sales = store_data_with_holidays %>%
group_by(Store, Year) %>%
summarize(Yearly_Sales=sum(Weekly_Sales))
## `summarise()` has grouped output by 'Store'. You can override using the
## `.groups` argument.
store_data_with_holidays = store_data_with_holidays %>%
left_join(yearly_sales, by=c("Store","Year")) %>%
mutate(Percentage_Sales=Weekly_Sales/Yearly_Sales)
head(store_data_with_holidays)
## Store Date Weekly_Sales Holiday_Flag Temperature Fuel_Price CPI
## 1 1 2010-02-05 1643690.9 0 42.31 2.572 211.0964
## 2 2 2010-02-05 2136989.5 0 40.19 2.572 210.7526
## 3 3 2010-02-05 461622.2 0 45.71 2.572 214.4249
## 4 4 2010-02-05 2135143.9 0 43.76 2.598 126.4421
## 5 5 2010-02-05 317173.1 0 39.70 2.572 211.6540
## 6 6 2010-02-05 1652635.1 0 40.43 2.572 212.6224
## Unemployment Year Month Day Week Yearly_Sales Percentage_Sales
## 1 8.106 2010 2 5 6 73278832 0.02243064
## 2 8.324 2010 2 5 6 95277864 0.02242902
## 3 7.368 2010 2 5 6 18745419 0.02462587
## 4 8.623 2010 2 5 6 95680471 0.02231536
## 5 6.566 2010 2 5 6 14836031 0.02137857
## 6 7.259 2010 2 5 6 76912321 0.02148726
store_data_with_holidays %>%
ggplot()+geom_point(aes(x=Temperature,y=Percentage_Sales,color=factor(Store)))
store_data_with_holidays %>%
ggplot()+geom_point(aes(x=CPI,y=Percentage_Sales,color=factor(Store)))
store_data_with_holidays %>%
ggplot()+geom_point(aes(x=Fuel_Price,y=Percentage_Sales,color=factor(Store)))
store_data_with_holidays %>%
ggplot()+geom_point(aes(x=Unemployment,y=Percentage_Sales,color=factor(Store)))
store_data_with_holidays %>%
group_by(Store) %>%
ggplot(aes(x=factor(Store),y=Percentage_Sales))+geom_point(aes(color=factor(Holiday_Flag)))
rounded_store_data_with_holidays = store_data_with_holidays %>%
mutate(
RoundedTemp=round(Temperature,digits=-1),
RoundedCPI=round(CPI,digits=-1),
RoundedFuel=round(Fuel_Price,digits=1),
RoundedUnemployment=round(Unemployment,digits=1)
)
rounded_store_data_with_holidays %>%
group_by(RoundedUnemployment) %>%
summarize(AvgSales=mean(Percentage_Sales)) %>%
ggplot(aes(x=RoundedUnemployment,y=AvgSales)) +
geom_bar(stat = "identity") +
geom_smooth(method="lm", aes(color="red")) +
theme(legend.position = "none")
## `geom_smooth()` using formula = 'y ~ x'
rounded_store_data_with_holidays %>%
group_by(RoundedTemp) %>%
summarize(AvgSales=mean(Percentage_Sales)) %>%
ggplot(aes(x=RoundedTemp,y=AvgSales)) +
geom_bar(stat = "identity") +
geom_smooth(method="lm", aes(color="red")) +
theme(legend.position = "none")
## `geom_smooth()` using formula = 'y ~ x'
rounded_store_data_with_holidays %>%
group_by(RoundedCPI) %>%
summarize(AvgSales=mean(Percentage_Sales)) %>%
ggplot(aes(x=RoundedCPI,y=AvgSales)) +
geom_bar(stat = "identity") +
geom_smooth(method="lm", aes(color="red")) +
theme(legend.position = "none")
## `geom_smooth()` using formula = 'y ~ x'
rounded_store_data_with_holidays %>%
group_by(RoundedFuel) %>%
summarize(AvgSales=mean(Percentage_Sales)) %>%
ggplot(aes(x=RoundedFuel,y=AvgSales)) +
geom_bar(stat = "identity") +
geom_smooth(method="lm", aes(color="red")) +
theme(legend.position = "none")
## `geom_smooth()` using formula = 'y ~ x'